Outlier Detection with bqplot


In this notebook, we create a class DNA that leverages the new bqplot canvas based HeatMap along with the ipywidgets Range Slider to help us detect and clean outliers in our data. The class accepts a DataFrame and allows you to visually and programatically filter your outliers. The cleaned DataFrame can then be retrieved through a simple convenience function.


In [ ]:
from scipy.stats import percentileofscore
from scipy.interpolate import interp1d
import bqplot.pyplot as plt
from bqplot import *
from traitlets import List, Float, observe
from ipywidgets import IntRangeSlider, Layout, VBox, HBox, jslink
from pandas import DatetimeIndex
import numpy as np
import pandas as pd

def quantile_space(x, q1=0.1, q2=0.9):
    '''
    Returns a function that squashes quantiles between q1 and q2
    '''
    q1_x, q2_x = np.percentile(x, [q1, q2])
    qs = np.percentile(x, np.linspace(0, 100, 100))
    def get_quantile(t):
        return np.interp(t, qs, np.linspace(0, 100, 100))
    def f(y):
        return np.interp(get_quantile(y), [0, q1, q2, 100], [-1, 0, 0, 1])
    return f

class DNA(VBox):
    
    colors = List()
    q1 = Float()
    q2 = Float()
    
    def __init__(self, data, **kwargs):
        self.data = data
        date_x, date_y = False, False
        transpose = kwargs.pop('transpose', False)
        if transpose is True:
            if type(data.index) is DatetimeIndex:
                self.x_scale = DateScale()
            if type(data.columns) is DatetimeIndex:
                self.y_scale = DateScale()
            x, y = list(data.columns.values), data.index.values
        else:
            if type(data.index) is DatetimeIndex:
                date_x = True
            if type(data.columns) is DatetimeIndex:
                date_y = True
            x, y = data.index.values, list(data.columns.values)
            
        self.q1, self.q2 = kwargs.pop('quantiles', (1, 99))
        
        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)
        self.colors = kwargs.pop('colors', ['Red', 'Black', 'Green'])
        
        self.x_scale = DateScale() if date_x is True else LinearScale()
        self.y_scale = DateScale() if date_y is True else OrdinalScale(padding_y=0)
        self.color_scale = ColorScale(colors=self.colors)
        self.heat_map = HeatMap(color=self.quant_func(self.data.T), x=x, y=y, scales={'x': self.x_scale, 'y': self.y_scale,
                                                                               'color': self.color_scale})
        self.x_ax = Axis(scale=self.x_scale)
        self.y_ax = Axis(scale=self.y_scale, orientation='vertical')
        show_axes = kwargs.pop('show_axes', True)
        self.axes = [self.x_ax, self.y_ax] if show_axes is True else []
        
        self.height = kwargs.pop('height', '800px')
        self.layout = kwargs.pop('layout', Layout(width='100%', height=self.height, flex='1'))
        self.fig_margin = kwargs.pop('fig_margin', {'top': 60, 'bottom': 60, 'left': 150, 'right': 0})
        kwargs.setdefault('padding_y', 0.0)
        
        self.create_interaction(**kwargs)
        
        self.figure = Figure(marks=[self.heat_map], axes=self.axes, fig_margin=self.fig_margin, 
                             layout=self.layout, min_aspect_ratio=0.,**kwargs)
        
        super(VBox, self).__init__(children=[self.range_slider, self.figure], layout=Layout(align_items='center',
                                                                                           width='100%',
                                                                                           height='100%'),
                                   **kwargs)
        
    def create_interaction(self, **kwargs):
        self.range_slider = IntRangeSlider(description='Filter Range', value=(self.q1, self.q2), layout=Layout(width='100%'))
        self.range_slider.observe(self.slid_changed, 'value')
        self.observe(self.changed, ['q1', 'q2'])
        
        
    def slid_changed(self, new):
        self.q1 = self.range_slider.value[0]
        self.q2 = self.range_slider.value[1]
        
    def changed(self, new):
        self.range_slider.value = (self.q1, self.q2)
        
        self.quant_func = quantile_space(self.data.values.flatten(), q1=self.q1, q2=self.q2)
        self.heat_map.color = self.quant_func(self.data.T)
        
    def get_filtered_df(self, fill_type='median'):
        q1_x, q2_x = np.percentile(self.data, [self.q1, self.q2])
        if fill_type == 'median':
            return self.data[(self.data >= q1_x) & (self.data <= q2_x)].apply(lambda x: x.fillna(x.median()))
        elif fill_type == 'mean':
            return self.data[(self.data >= q1_x) & (self.data <= q2_x)].apply(lambda x: x.fillna(x.mean()))
        else:
            raise ValueError("fill_type must be one of ('median', 'mean')")

We define the size of our matrix here. Larger matrices require a larger height.


In [ ]:
size = 100

In [ ]:
def num_to_col_letters(num):
    letters = ''
    while num:
        mod = (num - 1) % 26
        letters += chr(mod + 65)
        num = (num - 1) // 26
    return ''.join(reversed(letters))

letters = []

for i in range(1, size+1):
    letters.append(num_to_col_letters(i))

In [ ]:
data = pd.DataFrame(np.random.randn(size, size), columns=letters)

In [ ]:
data_dna = DNA(data, title='DNA of our Data', height='1400px', colors=['Red', 'White', 'Green'])
data_dna

Instead of setting the quantiles by the sliders, we can also set them programatically. Using a range of (5, 95) restricts the data considerably.


In [ ]:
data_dna.q1, data_dna.q2 = 5, 95

Now, we can use the convenience function to extract a clean DataFrame.


In [ ]:
data_clean = data_dna.get_filtered_df()

The DNA fills outliers with the mean of the column. Alternately, we can fill the outliers by the mean.


In [ ]:
data_mean = data_dna.get_filtered_df(fill_type='mean')

We can also visualize the new DataFrame the same way to test how our outliers look now.


In [ ]:
DNA(data_clean, title='Cleaned Data', height='1200px', colors=['Red', 'White', 'Green'])

In [ ]: